
import os

from s3_datasource import S3Datasource



# S3 URI prefix handed to S3Datasource as its first argument
# (presumably where the input PDFs live -- confirm against S3Datasource).
source_prefix = (
    "s3://llm-process-pperf/ebook_index_v4/ebook/v006/zhongwenzaixian/pdf/"
)

# S3 URI prefix handed to S3Datasource as its second argument
# (presumably where the processed output is written -- confirm).
target_prefix = (
    "s3://llm-pdf-text-1/pdf_gpu_output/zhongwenzaixian/v006/"
)


if __name__ == "__main__":
    # Script entry point: build the datasource for the configured prefixes,
    # ask it for read tasks, and print how many it produced.
    import ray

    # NOTE(review): runtime_env is constructed but never passed to anything
    # in this block -- presumably meant for ray.init(runtime_env=...); confirm.
    runtime_env = dict(
        working_dir="/cpfs01/user/xurui/doc-infer/ray-pipeline",
    )

    ds = S3Datasource(source_prefix, target_prefix)

    # The argument 20 is presumably the requested parallelism / task count;
    # verify against S3Datasource.get_read_tasks.
    tasks = ds.get_read_tasks(20)
    task_count = len(tasks)
    print(task_count)
